Here, we assessed again multiple combinations of TSO, RT primer and RNA amounts, using a different stock of TSOs (PO_8268526), purchased earlier but apparently of better quality (see experiment 6), and with a more extensive randomisation of TSO barcodes and well coordinates (see designs 6a, 6b, 6c and 6d).
library("CAGEr")
library("ggplot2")
library("magrittr")
library("MultiAssayExperiment")
library("SummarizedExperiment")
library("viridis")
MOIRAI shortcuts
MISEQ_RUN <- "180517_M00528_0364_000000000-BRGK6"
WORKFLOW <- "OP-WORKFLOW-CAGEscan-short-reads-v2.1.2"
MOIRAI_STAMP <- "20180521173736"
MOIRAI_PROJ <- "project/Labcyte_test_decoy"
MOIRAI_USER <- "nanoCAGE2"
ASSEMBLY <- "mm9"
BASEDIR <- "/osc-fs_home/scratch/moirai"
MOIRAI_BASE <- file.path(BASEDIR, MOIRAI_USER)
MOIRAI_RESULTS <- file.path(MOIRAI_BASE, MOIRAI_PROJ, paste(MISEQ_RUN, WORKFLOW, MOIRAI_STAMP, sep = "."))
ce <- smallCAGEqc::loadMoiraiStats(
pipeline = WORKFLOW,
multiplex = file.path( MOIRAI_BASE, "input", paste0(MISEQ_RUN, ".multiplex.txt")),
summary = file.path( MOIRAI_RESULTS, "text", "summary.txt")) %>% DataFrame
ce$inputFiles <- paste0(MOIRAI_RESULTS, "/CAGEscan_fragments/", ce$samplename, ".bed")
# Discard lines for which input files do not exist.
ce <- ce[sapply(ce$inputFiles, file.exists),]
# Discard lines for which input files are empty.
ce <- ce[file.info(ce$inputFiles)$size != 0,]
ce$inputFilesType <- c("bed")
ce$sampleLabels <- as.character(ce$samplename)
ce <- ce[ce$group != "decoy",] # Temporary fix
ce
## DataFrame with 1520 rows and 16 columns
## samplename group barcode index total
## <factor> <factor> <factor> <factor> <numeric>
## CTGCGT_TAAGGCGA CTGCGT_TAAGGCGA TAAGGCGA CTGCGT TAAGGCGA 0
## GCTGCA_TAAGGCGA GCTGCA_TAAGGCGA TAAGGCGA GCTGCA TAAGGCGA 0
## CGATAC_TAAGGCGA CGATAC_TAAGGCGA TAAGGCGA CGATAC TAAGGCGA 0
## ACAGAT_TAAGGCGA ACAGAT_TAAGGCGA TAAGGCGA ACAGAT TAAGGCGA 0
## AGTAGC_TAAGGCGA AGTAGC_TAAGGCGA TAAGGCGA AGTAGC TAAGGCGA 0
## ... ... ... ... ... ...
## GTAGAT_TCGACGTC GTAGAT_TCGACGTC TCGACGTC GTAGAT TCGACGTC 0
## AGTCTC_TCGACGTC AGTCTC_TCGACGTC TCGACGTC AGTCTC TCGACGTC 0
## ATCTAC_TCGACGTC ATCTAC_TCGACGTC TCGACGTC ATCTAC TCGACGTC 0
## ACATAC_TCGACGTC ACATAC_TCGACGTC TCGACGTC ACATAC TCGACGTC 0
## CTGCAG_TCGACGTC CTGCAG_TCGACGTC TCGACGTC CTGCAG TCGACGTC 0
## extracted cleaned tagdust rdna spikes
## <numeric> <numeric> <numeric> <numeric> <numeric>
## CTGCGT_TAAGGCGA 2062 783 104 1175 0
## GCTGCA_TAAGGCGA 9540 7596 427 1517 0
## CGATAC_TAAGGCGA 2793 2080 166 544 3
## ACAGAT_TAAGGCGA 1130 541 70 519 0
## AGTAGC_TAAGGCGA 10119 6985 514 2617 3
## ... ... ... ... ... ...
## GTAGAT_TCGACGTC 1120 85 1027 7 1
## AGTCTC_TCGACGTC 35285 2846 32312 124 3
## ATCTAC_TCGACGTC 22731 1584 21098 49 0
## ACATAC_TCGACGTC 24903 2125 22686 90 2
## CTGCAG_TCGACGTC 10539 924 9546 69 0
## mapped properpairs counts
## <numeric> <numeric> <numeric>
## CTGCGT_TAAGGCGA 720 618 603
## GCTGCA_TAAGGCGA 7283 6074 5889
## CGATAC_TAAGGCGA 1982 1618 1571
## ACAGAT_TAAGGCGA 498 384 367
## AGTAGC_TAAGGCGA 6653 5526 5341
## ... ... ... ...
## GTAGAT_TCGACGTC 13 3 1
## AGTCTC_TCGACGTC 386 183 150
## ATCTAC_TCGACGTC 125 18 16
## ACATAC_TCGACGTC 263 122 88
## CTGCAG_TCGACGTC 178 77 47
## inputFiles
## <character>
## CTGCGT_TAAGGCGA /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/CTGCGT_TAAGGCGA.bed
## GCTGCA_TAAGGCGA /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/GCTGCA_TAAGGCGA.bed
## CGATAC_TAAGGCGA /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/CGATAC_TAAGGCGA.bed
## ACAGAT_TAAGGCGA /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/ACAGAT_TAAGGCGA.bed
## AGTAGC_TAAGGCGA /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/AGTAGC_TAAGGCGA.bed
## ... ...
## GTAGAT_TCGACGTC /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/GTAGAT_TCGACGTC.bed
## AGTCTC_TCGACGTC /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/AGTCTC_TCGACGTC.bed
## ATCTAC_TCGACGTC /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/ATCTAC_TCGACGTC.bed
## ACATAC_TCGACGTC /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/ACATAC_TCGACGTC.bed
## CTGCAG_TCGACGTC /osc-fs_home/scratch/moirai/nanoCAGE2/project/Labcyte_test_decoy/180517_M00528_0364_000000000-BRGK6.OP-WORKFLOW-CAGEscan-short-reads-v2.1.2.20180521173736/CAGEscan_fragments/CTGCAG_TCGACGTC.bed
## inputFilesType sampleLabels
## <character> <character>
## CTGCGT_TAAGGCGA bed CTGCGT_TAAGGCGA
## GCTGCA_TAAGGCGA bed GCTGCA_TAAGGCGA
## CGATAC_TAAGGCGA bed CGATAC_TAAGGCGA
## ACAGAT_TAAGGCGA bed ACAGAT_TAAGGCGA
## AGTAGC_TAAGGCGA bed AGTAGC_TAAGGCGA
## ... ... ...
## GTAGAT_TCGACGTC bed GTAGAT_TCGACGTC
## AGTCTC_TCGACGTC bed AGTCTC_TCGACGTC
## ATCTAC_TCGACGTC bed ATCTAC_TCGACGTC
## ACATAC_TCGACGTC bed ACATAC_TCGACGTC
## CTGCAG_TCGACGTC bed CTGCAG_TCGACGTC
Using transfer designs 6a, 6b, 6c and 6d.
plate <- rbind(
cbind(read.table("plate6a.txt", sep = "\t", header=TRUE, stringsAsFactors=FALSE), repl="Q")
, cbind(read.table("plate6b.txt", sep = "\t", header=TRUE, stringsAsFactors=FALSE), repl="R")
, cbind(read.table("plate6c.txt", sep = "\t", header=TRUE, stringsAsFactors=FALSE), repl="S")
, cbind(read.table("plate6d.txt", sep = "\t", header=TRUE, stringsAsFactors=FALSE), repl="T"))
stopifnot(identical(plate, plate[!duplicated(plate),]))
ce %<>% cbind(plate[match( paste(ce$barcode, ce$index)
, paste(plate$BARCODE_SEQ, plate$INDEX)), ])
ce$index %<>% factor(levels = unique(plate$INDEX)) # Keep original order of indexes.
ce$plateID <- ce$repl # Define plate IDs
rownames(ce) %<>% paste(ce$plateID, sep = "_")
ce$sampleLabels <- rownames(ce)
rm(plate)
if(file.exists(paste0("Labcyte-RT_Data_Analysis_", expNumber, ".Rds"))) {
ce <- readRDS(paste0("Labcyte-RT_Data_Analysis_", expNumber, ".Rds"))
} else {
getCTSS(ce, useMulticore = TRUE)
removeStrandInvaders(ce)
}
Collect annotations and gene symbols via a local GENCODE file (mm9 GENCODE not available in AnnotationHub)
if(file.exists(paste0("Labcyte-RT_Data_Analysis_", expNumber, ".Rds"))) {
print("Annotated data loaded from file")
} else {
annotateCTSS(ce, rtracklayer::import.gff("/osc-fs_home/scratch/gmtu/annotation/mus_musculus/gencode-M1/gencode.vM1.annotation.gtf.gz"))
}
## [1] "Annotated data loaded from file"
saveRDS(ce, paste0("Labcyte-RT_Data_Analysis_", expNumber, ".Rds"))
Custom scopes displaying strand invasion artefacts.
source("customScopes.R", echo = TRUE)
##
## > msScope_qcSI <- function(libs) {
## + libs$Tag_dust <- libs$extracted - libs$rdna - libs$spikes -
## + libs$cleaned
## + libs$rDNA <- libs$r .... [TRUNCATED]
##
## > msScope_counts <- function(libs) {
## + libs$Promoter <- libs$promoter
## + libs$Exon <- libs$exon
## + libs$Intron <- libs$intron
## + libs$Int .... [TRUNCATED]
##
## > msScope_libSizeNormByBarcode <- function(libs) {
## + libs$Yield <- libs$libSizeNormByBarcode
## + list(libs = libs, columns = c("Yield"), total = .... [TRUNCATED]
##
## > msScope_libSizeNormByIndex <- function(libs) {
## + libs$Yield <- libs$libSizeNormByIndex
## + list(libs = libs, columns = c("Yield"), total = lib .... [TRUNCATED]
##
## > msScope_libSize <- function(libs) {
## + libs$Yield <- libs$librarySizes
## + list(libs = libs, columns = c("Yield"), total = libs$Yield)
## + }
Some negative controls did not give any sequence and therefore are not in the CAGEr object at all.
summary(ce$RNA_vol == 0)
## Mode FALSE TRUE
## logical 1506 14
The signal in negative controls is higher than before in experiment 5 and experiment 6. Could that suggest index swapping? Not really because in this experiment, a specific set of barcodes was chosen for negative controls, therefore they are either missed or correct. Actually, a lot are missed. However, this is not entirely surprising since only weak signal is expected.
ce$NC <- ce$RNA_vol == 0
ggpubr::ggarrange( legend = "right", common.legend = TRUE,
plotAnnot( ce, scope = msScope_qcSI, group = "NC"
, title = NULL, facet = "index", normalise = FALSE) +
facet_wrap("facet", ncol = 1) +
ylab("sequence counts") + xlab("Negative control ?"),
plotAnnot( ce, scope = msScope_qcSI, group = "NC"
, title = NULL, facet = "index", normalise = TRUE) +
facet_wrap("facet", ncol = 1) +
ylab("Normalised to 100%") + xlab("Negative control ?")
) %>% ggpubr::annotate_figure(top="QC report, by replicate set")
The negative controls of TCCTGAGC, GGACTCCT, AAGAGGCA, GCTCATGA, CGGAGCCT, TACGCTGCm, ACTGAGCG, CCTAAGAC and CGATCAGT have a higher promoter rate thatn the other samples; this is strange. It was not the case in experiment 6.
ggpubr::ggarrange( legend = "right", common.legend = TRUE,
plotAnnot( ce, scope = msScope_counts, group = "NC"
, title = NULL, facet = "index", normalise = FALSE) +
facet_wrap("facet", ncol = 1) +
ylab("sequence counts") + xlab("Negative control ?"),
plotAnnot( ce, scope = msScope_counts, group = "NC"
, title = NULL, facet = "index", normalise = TRUE) +
facet_wrap("facet", ncol = 1) +
ylab("Normalised to 100%") + xlab("Negative control ?")
) %>% ggpubr::annotate_figure(top="QC annotation, by replicate set")
To ease data handling, the negative controls with no RNA are removed. By design all of these negative controls had 10 μM TSO.
colData(ce[,ce$NC]) %>% data.frame %>% summary
## harmonizing input:
## removing 1506 sampleMap rows with 'colname' not in colnames of experiments
## removing 1506 colData rownames not in sampleMap 'primary'
## samplename group barcode index total
## ACATCT_TGCAGCTA:1 AAGAGGCA:1 CGATCT :3 TCCTGAGC:1 Min. :0
## ATCAGC_AAGAGGCA:1 ACTGAGCG:1 GTATCT :3 GGACTCCT:1 1st Qu.:0
## ATCAGC_CTCTCTAC:1 ATCTCAGG:1 ATCAGC :2 CTCTCTAC:1 Median :0
## CACGCA_CCTAAGAC:1 ATGCGCAG:1 CACGCA :2 AAGAGGCA:1 Mean :0
## CACGCA_TACGCTGC:1 CCTAAGAC:1 TCGAGC :2 GCTCATGA:1 3rd Qu.:0
## CGATCT_ATCTCAGG:1 CGATCAGT:1 ACATCT :1 ATCTCAGG:1 Max. :0
## (Other) :8 (Other) :8 (Other):1 (Other) :8
## extracted cleaned tagdust rdna
## Min. : 259.0 Min. : 59.0 Min. : 12.0 Min. : 6.00
## 1st Qu.: 469.8 1st Qu.:138.0 1st Qu.: 138.2 1st Qu.: 37.25
## Median : 587.0 Median :178.5 Median : 250.5 Median : 49.50
## Mean : 735.1 Mean :199.6 Mean : 425.2 Mean :109.93
## 3rd Qu.: 884.0 3rd Qu.:228.2 3rd Qu.: 485.0 3rd Qu.:128.00
## Max. :1890.0 Max. :380.0 Max. :1577.0 Max. :518.00
##
## spikes mapped properpairs counts
## Min. :0.0000 Min. : 10.00 Min. : 4.00 Min. : 4.00
## 1st Qu.:0.0000 1st Qu.: 68.75 1st Qu.: 46.75 1st Qu.: 43.25
## Median :0.0000 Median :120.50 Median : 92.50 Median : 86.50
## Mean :0.3571 Mean :135.50 Mean :106.00 Mean :101.57
## 3rd Qu.:0.7500 3rd Qu.:147.25 3rd Qu.:117.75 3rd Qu.:115.50
## Max. :2.0000 Max. :356.00 Max. :300.00 Max. :298.00
##
## inputFiles inputFilesType sampleLabels
## Length:14 Length:14 Length:14
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## well row col sxt
## Length:14 Length:14 Min. : 1.00 Length:14
## Class :character Class :character 1st Qu.:12.00 Class :character
## Mode :character Mode :character Median :15.00 Mode :character
## Mean :14.57
## 3rd Qu.:19.50
## Max. :24.00
##
## BARCODE_ID TSO_source TSO_vol TSO RT_PRIMERS
## Min. : 8.00 Min. :25 Min. :25 Min. :1.25 Min. :1
## 1st Qu.:32.00 1st Qu.:25 1st Qu.:25 1st Qu.:1.25 1st Qu.:1
## Median :44.00 Median :25 Median :25 Median :1.25 Median :1
## Mean :52.57 Mean :25 Mean :25 Mean :1.25 Mean :1
## 3rd Qu.:80.00 3rd Qu.:25 3rd Qu.:25 3rd Qu.:1.25 3rd Qu.:1
## Max. :92.00 Max. :25 Max. :25 Max. :1.25 Max. :1
##
## RT_PRIMERS_vol MASTER_MIX_vol INDEX RNA RNA_vol
## Min. :25 Min. :350 Length:14 Min. :0 Min. :0
## 1st Qu.:25 1st Qu.:350 Class :character 1st Qu.:0 1st Qu.:0
## Median :25 Median :350 Mode :character Median :0 Median :0
## Mean :25 Mean :350 Mean :0 Mean :0
## 3rd Qu.:25 3rd Qu.:350 3rd Qu.:0 3rd Qu.:0
## Max. :25 Max. :350 Max. :0 Max. :0
##
## H2O_vol BARCODE_SEQ RNA_level RTP_level
## Min. :100 Length:14 Length:14 Length:14
## 1st Qu.:100 Class :character Class :character Class :character
## Median :100 Mode :character Mode :character Mode :character
## Mean :100
## 3rd Qu.:100
## Max. :100
##
## plateID repl librarySizes strandInvaders promoter
## Q:2 Q:2 Min. : 4.00 Min. : 0.000 Min. : 4.00
## R:4 R:4 1st Qu.: 41.75 1st Qu.: 2.000 1st Qu.: 33.00
## S:3 S:3 Median : 83.00 Median : 3.000 Median : 61.00
## T:5 T:5 Mean : 97.93 Mean : 3.643 Mean : 76.79
## 3rd Qu.:108.50 3rd Qu.: 4.000 3rd Qu.: 87.50
## Max. :288.00 Max. :10.000 Max. :218.00
##
## exon intron unknown NC
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Mode:logical
## 1st Qu.: 2.250 1st Qu.: 0.250 1st Qu.: 4.25 TRUE:14
## Median : 4.000 Median : 2.000 Median : 8.50
## Mean : 5.214 Mean : 3.143 Mean :12.79
## 3rd Qu.: 8.000 3rd Qu.: 5.250 3rd Qu.:13.75
## Max. :13.000 Max. :12.000 Max. :48.00
##
ce.bak <- ce
ce <- ce[, ! ce$NC]
## harmonizing input:
## removing 14 sampleMap rows with 'colname' not in colnames of experiments
## removing 14 colData rownames not in sampleMap 'primary'
Discrepancy between replicates:
CGAGGCTG has an excess artefacts (tag dust) compared to same libraries from other plates.TAGGCATG has a different profile compared to other plates, but its amount of tag dust clearly indicates that it is correctly identified as a “1 pg” library. Same for GCTCATGA in the 10 pg libraries.The following barcodes were found in libraries where they were not supposed to be used:
ATCAGC in ACTCGCTA, GCGTAGTA, TAGCGCTC, AGGCAGAA, GGAGCTAC, GTAGAGGA, CGTACTAG and TAAGGCGA.ACATCT in TAGGCATG.GTATCT in CGAGGCTG.In experiment 4, swapped indexes were CGTACTAG, AGGCAGAA and TAGGCATG, CTCTCTAC.
plotAnnot( ce, scope = msScope_qcSI, group = "RT_PRIMERS", normalise = FALSE
, title = NULL, facet = "index") + ylab("Number of read pairs") +
xlab(NULL) + facet_wrap(~facet, ncol = 6, scale = "free")
plotAnnot( ce, scope = msScope_qcSI, group = "RT_PRIMERS", normalise = TRUE
, title = NULL, facet = "index") + ylab("Number of read pairs") +
xlab(NULL) + facet_wrap(~facet, ncol = 6, scale = "fixed")
plotAnnot( ce, scope = msScope_qcSI, group = "TSO", normalise = FALSE
, title = NULL, facet = "index") + ylab("Number of read pairs") +
xlab(NULL) + facet_wrap(~facet, ncol = 6, scale = "free")
plotAnnot( ce, scope = msScope_qcSI, group = "index", normalise = FALSE
, title = NULL, facet = "plateID") + ylab("Number of read pairs") +
xlab(NULL) + facet_wrap(~facet, ncol = 4, scale = "free")
Barcodes 34 and 7 gave more sequences than average, but their normalised QC profile or their annotation profile does not stand out
Tier_A <- c( 3, 15, 27, 39, 51, 63, 75
, 10, 22, 34, 46, 58, 70, 82
, 11, 23, 35, 47, 59, 71, 83)
Tier_B <- c( 1, 13, 25, 37, 49, 61, 73
, 2, 14, 26, 38, 50, 62, 74
, 7, 19, 31, 43, 55, 67, 79)
Tier_C <- c( 4, 16, 28, 40, 52, 64, 76
, 5, 17, 29, 41, 65, 77, 89
, 6, 18, 30, 42, 66, 78, 90)
Tier_N <- c( 8, 20, 32, 44, 68, 80, 92)
ce$BN <- factor(ce$BARCODE_ID, levels = c(Tier_A, Tier_B, Tier_C, Tier_N))
plotAnnot( ce, scope = msScope_qcSI, group = "repl"
, title = "Sequence counts"
, facet = "BN", normalise = FALSE) +
facet_wrap(~facet, ncol=7)
plotAnnot( ce, scope = msScope_qcSI, group = "repl"
, title = "Sequence counts (normalised)"
, facet = "BN", normalise = TRUE) +
facet_wrap(~facet, ncol=7)
plotAnnot( ce, scope = msScope_counts, group = "repl"
, title = "Annotation counts"
, facet = "BN", normalise = FALSE) +
facet_wrap(~facet, ncol=7)
Strangely, libraries made with no RT primers have a QC profile that is not dramatically different from other libraries. This might again be explained by contaminations, although the amount of sequences in the “no RT primer” samples is a bit high for such an explanation.
ggpubr::ggarrange(legend = "right", common.legend = TRUE,
plotAnnot( ce, scope = msScope_qcSI, group = "RT_PRIMERS", normalise = FALSE
, title = NULL) + ylab("Number of read pairs") + xlab(NULL),
plotAnnot( ce, scope = msScope_qcSI, group = "RT_PRIMERS", normalise = TRUE
, title = NULL) + ylab("Normalised to 100 %") + xlab(NULL)) %>%
ggpubr::annotate_figure(top="QC control, by amount of RT primers (in μM)")
ggpubr::ggarrange(legend = "right", common.legend = TRUE,
plotAnnot( ce, scope = msScope_counts, group = "RT_PRIMERS", normalise = FALSE
, title = NULL) + ylab("Number of read pairs") + xlab(NULL),
plotAnnot( ce, scope = msScope_counts, group = "RT_PRIMERS", normalise = TRUE
, title = NULL) + ylab("Normalised to 100 %") + xlab(NULL)) %>%
ggpubr::annotate_figure(top="QC annotation, by amount of RT primers (in μM)")
Consistency of RTP-negative controls (still looking for index swap)
plotAnnot( ce[,ce$TSO == 10], scope = msScope_qcSI, group = "RT_PRIMERS", normalise = FALSE
, title = NULL, facet = "index") + ylab("Number of read pairs") +
xlab(NULL) + facet_wrap(~facet, ncol = 6, scale = "fixed")
## harmonizing input:
## removing 1338 sampleMap rows with 'colname' not in colnames of experiments
## removing 1338 colData rownames not in sampleMap 'primary'
## Warning: Removed 1344 rows containing missing values (geom_segment).
## Warning: Removed 1344 rows containing missing values (geom_point).
To ease data handling (for instance when working with primer ratios), the negative controls with no primers are removed.
ce <- ce[,ce$RT_PRIMERS != 0]
## harmonizing input:
## removing 215 sampleMap rows with 'colname' not in colnames of experiments
## removing 215 colData rownames not in sampleMap 'primary'
Highest TSO concentrations give largest molecule counts, except at the highest RNA concentration (200 ng / µL)
Large number of tag artefacts are produced at lowest RNA concentrations (20 or 1 pg / µL). Note that the libraries have not been size-selected on purpose, to estimate the amount of artefacts.
rRNA rate peaks at the highest TSO concentrations, and has another peak at intermediate concentrations (~2.5 µM)
Strand invation seems to depend more on RNA quantity than on TSO concentration.
-> let’s do the next experiment at 10 µM final (200 µM stock), like in the nanoCAGE 2017 protocol. Slightly higher concentrations may have been better but we need to consider that our stock plate is soon running out. In addition, the actual TSO concentrations in this experiment are lower than the expected value, according to the survey of the concentrations of the stock plate PO_8268526.
ggpubr::ggarrange(legend = "right",
plotAnnot( ce, scope = msScope_qcSI, group = "TSO", facet="RNA"
, normalise = FALSE, title = "Yield") +
ylab("Number of read pairs") + xlab("TSO") +
facet_wrap(~facet, ncol = 1),
plotAnnot( ce, scope = msScope_qcSI, group = "TSO", facet="RNA"
, normalise = TRUE, title = "QC report") +
ylab("Normalised to 100 %") + xlab(NULL) +
facet_wrap(~facet, ncol = 1)) %>%
ggpubr::annotate_figure(top="QC control, by TSO concentration and amount of RNA (ng)")
At low RNA concentrations, rRNA rate decreases with RT primer concentration…
at high RNA concentrations, molecule counts only increase marginally with RT primer concentration.
-> Let’s do the next experiment at 2 µM.
ggpubr::ggarrange(legend = "right",
plotAnnot( ce, scope = msScope_qcSI, group = "RT_PRIMERS", facet="RNA"
, normalise = FALSE, title = "Yield") +
ylab("Number of read pairs") + xlab("TSO") +
facet_wrap(~facet, ncol = 1),
plotAnnot( ce, scope = msScope_qcSI, group = "RT_PRIMERS", facet="RNA"
, normalise = TRUE, title = "QC report") +
ylab("Normalised to 100 %") + xlab(NULL) +
facet_wrap(~facet, ncol = 1)) %>%
ggpubr::annotate_figure(top="QC control, by RTP concentration and amount of RNA (ng)")
If not removed during library preparation, oligonucleotide artifacts strongly dominate libraries prepared with 1 pg RNA. In general, the amount of artefacts increases when starting RNA amounts decrease. Here, adding RT primers increases artefacts. In contrary, and somewhat surprisingly, adding TSOs seem to reduce artefacts.
Sub panel at 10,000 pg is noisy because replicate CGAGGCTG is an outlier with a large amount of artefacts.
ggplot(colData(ce) %>% data.frame, aes(TSO, tagdust / extracted * 100, color=RT_PRIMERS %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "fixed") +
scale_color_viridis(discrete = TRUE, name = "[RTP] (µM)") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = ce$TSO %>% unique %>% sort) +
scale_y_continuous("Tag dust (% of extracted reads)") +
ggtitle("Increasing TSO molarity stronlgy reduces artifact amounts.")
## `geom_smooth()` using method = 'loess'
Plot where all RT primer concentrations are pooled, showing influence of RNA mass and TSO concentration:
ggplot(colData(ce) %>% data.frame, aes(TSO, tagdust / extracted * 100, color=RNA %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "fixed", nrow = 1) +
scale_color_viridis(discrete = TRUE, name = "RNA (pg)", option = "magma") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = c(0.625, 2.500, 10.000, 40.000, 160.000)) +
scale_y_continuous("Tag dust (% of extracted reads)") +
ggtitle("Increasing TSO molarity stronlgy reduces artifact amounts.")
## `geom_smooth()` using method = 'loess'
Same with all RNA amount overlayed in a single plot.
ggplot(colData(ce) %>% data.frame, aes(TSO, tagdust / extracted * 100, color=RNA %>% factor)) +
geom_point() +
geom_smooth() +
scale_color_viridis(discrete = TRUE, name = "RNA (pg)", option = "magma") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = c(0.625, 2.500, 10.000, 40.000, 160.000)) +
scale_y_continuous("Tag dust (% of extracted reads)") +
ggtitle("TSO molarity stronlgy influences artifact amounts.")
## `geom_smooth()` using method = 'loess'
ce$rRNA_rate <- ce$rdna / (ce$extracted - ce$tagdust)
ggplot(colData(ce) %>% data.frame, aes(TSO, rRNA_rate, color=RT_PRIMERS %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "fixed") +
scale_color_viridis(discrete = TRUE, name = "[RTP] (µM)") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = c(0.6, 2.5, 10, 40, 160)) +
scale_y_continuous("rRNA (% of non-tagdust extracted reads)") +
ggtitle("TSO molarity stronlgy modulates rRNA amounts.")
## `geom_smooth()` using method = 'loess'
Because we multiplexed reactions together, the ones with the highest yield will give the largest amount of reads. Higher yield gives the possibility of reducing the number of PCR cycles.
Since multiplexing is not perfect, each library had a different number of reads. Therefore, to compare yields in terms of number of aligned reads, etc, one needs to normalise per indexed library.
tapply(ce$librarySizes, ce$index, sum)
## TAAGGCGA CGTACTAG AGGCAGAA TCCTGAGC GGACTCCT TAGGCATG CTCTCTAC CGAGGCTG
## 129884 104142 115271 63777 33080 2063 228370 141531
## AAGAGGCA GTAGAGGA GCTCATGA ATCTCAGG ACTCGCTA GGAGCTAC GCGTAGTA CGGAGCCT
## 123337 134703 90912 20575 127701 116273 121044 123633
## TACGCTGC ATGCGCAG TAGCGCTC ACTGAGCG CCTAAGAC CGATCAGT TGCAGCTA TCGACGTC
## 27205 3363 127514 131305 94921 76723 27828 3490
indexMean <- tapply(ce$librarySizes, ce$index, mean)
ce$libSizeNormByIndex <-
mapply( FUN = function(n, index) n / indexMean[index]
, n = ce$librarySizes
, index = ce$index)
RT primer molarity mildly influences yield. Higher molarities are needed when TSO molarity is increased. Conversely, high molarities are detrimental for low TSO amounts. In brief, the RT primer concentration must be adjusted to the TSO concentration.
ggplot(colData(ce) %>% data.frame, aes(RT_PRIMERS, libSizeNormByIndex, color=TSO %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "fixed") +
scale_color_viridis(discrete = TRUE, name = "[TSO] (µM)", option = "magma") +
scale_x_log10( "RT primer molarity (µM)"
, breaks = ce$RT_PRIMERS %>% unique %>% sort) +
scale_y_log10( "Normalised counts (arbitrary scale)"
, breaks = c(0.01, 0.1, 1, 10)) +
ggtitle("RT primer concentration mildly influences yield.")
## `geom_smooth()` using method = 'loess'
Since the trend appears true for all RNA concentrations, the following figure pools all data.
ggplot(colData(ce) %>% data.frame, aes(RT_PRIMERS, libSizeNormByIndex, color=TSO %>% factor)) +
geom_point() +
geom_smooth() +
# facet_wrap(~RNA, scales = "fixed") +
scale_color_viridis(discrete = TRUE, name = "[TSO] (µM)", option = "magma") +
scale_x_log10( "RT primer molarity (µM)"
, breaks = ce$RT_PRIMERS %>% unique %>% sort) +
scale_y_log10( "Normalised counts (arbitrary scale)"
, breaks = c(0.01, 0.1, 1, 10)) +
ggtitle("RT primer concentration mildly influences yield.")
## `geom_smooth()` using method = 'loess'
The percent of proper pairs aligned indicate the amount of data that goes in the analysis. The rest is basically discarded. Here, we do not take tag dust artefact into account, assuming that they can be removed before sequencing.
The results are somewhat symmetric with the rRNA rate, since rRNA reads are a large proportion of the discarded data.
ce$mapping_rate <- ce$properpairs / (ce$extracted - ce$tagdust) * 100
ggplot(colData(ce) %>% data.frame, aes(TSO, mapping_rate, color=RT_PRIMERS %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "fixed") +
scale_color_viridis(discrete = TRUE, name = "[RTP] (µM)") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = c(0.6, 2.5, 10, 40, 160)) +
scale_y_continuous("Mapping rate (% of extracted reads)") +
ggtitle("TSO molarity stronlgy influences mapping rate.")
## `geom_smooth()` using method = 'loess'
Strand invasion artefacts are also discarded, but at a later step. In this experiment, their amount was reasonably low.
Interestingly, the amount of strand invaders was minimised by high amounts of TSOs and RT primers. Does that mean that strand invasion happen first, and then template-switching happens if primers remain ?
ce$strand_invasion_rate <- ce$strandInvaders / (ce$counts + ce$strandInvaders) * 100
ggplot(colData(ce) %>% data.frame, aes(TSO, strand_invasion_rate, color=RT_PRIMERS %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "fixed") +
scale_color_viridis(discrete = TRUE, name = "[RTP] (µM)") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = c(0.6, 2.5, 10, 40, 160)) +
scale_y_continuous("Strand invasion rate (% of molecule counts)") +
ggtitle("Oligonucleotide molarity stronlgy influences strand invasion rate.")
## `geom_smooth()` using method = 'loess'
High promoter rate is THE goal of a CAGE experiment. The molarity of RT primer does not seem to matter much. Promoter rate reaches optimum at TSO molarities higher than 10 µM.
ce$promoter_rate <- ce$promoter / (ce$counts) * 100
ggplot(colData(ce) %>% data.frame, aes(TSO, promoter_rate, color=RT_PRIMERS %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "fixed") +
scale_color_viridis(discrete = TRUE, name = "[RTP] (µM)") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = c(0.6, 2.5, 10, 40, 160)) +
scale_y_continuous("Promoter rate (% of molecule counts after SI removal)") +
ggtitle("Promoter rate reaches optimum at high TSO molarity.")
## `geom_smooth()` using method = 'loess'
Low TSO molarities are much more detrimental for promoter rate at high RNA concentrations.
ggplot(colData(ce) %>% data.frame, aes(TSO, promoter_rate, color=RNA %>% factor)) +
geom_point() +
geom_smooth() +
# facet_wrap(~RNA, scales = "fixed") +
scale_color_viridis(discrete = TRUE, name = "RNA (pg)", option = "magma") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = c(0.6, 2.5, 10, 40, 160)) +
scale_y_continuous("Promoter rate (% of molecule counts after SI removal)") +
ggtitle("Promoter rate reaches optimum at high TSO molarity.")
## `geom_smooth()` using method = 'loess'
Using a free plotting scale as richness reaches extremely low values at 1 pg RNA.
Many libraries were too shallowly sequenced to allow to calculate richness on a scale of 100.
Richness is higher when RT primer molarity is higher.
CTSStoGenes(ce)
ce$r10g <- vegan::rarefy(t(assay(ce[["geneExpMatrix"]])),10)
## Warning in vegan::rarefy(t(assay(ce[["geneExpMatrix"]])), 10): Requested
## 'sample' was larger than smallest site maximum (0)
ce$r10g[ce$counts < 10] <- NA
ggplot(colData(ce) %>% data.frame, aes(TSO, r10g, color=RT_PRIMERS %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "free") +
scale_color_viridis(discrete = TRUE, name = "[RTP] (µM)") +
scale_x_log10( "TS oligonucleotide molarity (µM)"
, breaks = c(0.6, 2.5, 10, 40, 160)) +
scale_y_continuous("Gene richness (on a scale of 10)") +
ggtitle("Higher RT primer concentration give higher richness.")
## `geom_smooth()` using method = 'loess'
## Warning: Removed 62 rows containing non-finite values (stat_smooth).
## Warning: Removed 62 rows containing missing values (geom_point).
On a scale of 100, we see that TSO concentration needs to be matched with RNA amounts.
CTSStoGenes(ce)
ce$r100g <- vegan::rarefy(t(assay(ce[["geneExpMatrix"]])),100)
ce$r100g[ce$counts < 100] <- NA
ggplot(colData(ce) %>% data.frame, aes(TSO, r100g, color=RT_PRIMERS %>% factor)) +
geom_point() +
geom_smooth() +
facet_wrap(~RNA, scales = "free") +
scale_color_viridis(discrete = TRUE, name = "[TSO] (µM)") +
scale_x_log10( "TSO molarity (µM)"
, breaks = c(0.6, 2.5, 10, 40, 160)) +
scale_y_continuous("Gene richness (on a scale of 100)") +
ggtitle("Higher RT primer concentration give higher richness.")
## `geom_smooth()` using method = 'loess'
CTSStoGenes(ce)
ce$r100g <- vegan::rarefy(t(assay(ce[["geneExpMatrix"]])),100)
## Warning in vegan::rarefy(t(assay(ce[["geneExpMatrix"]])), 100): Requested
## 'sample' was larger than smallest site maximum (0)
ce$r100g[ce$counts < 100] <- NA
ggplot(colData(ce) %>% data.frame, aes(RT_PRIMERS, r100g, color=RNA %>% factor)) +
geom_point() +
geom_smooth() +
scale_color_viridis(discrete = TRUE, name = "RNA (pg)", option = "magma") +
scale_x_log10( "RT primer molarity (µM)"
, breaks = ce$RT_PRIMERS %>% unique %>% sort) +
scale_y_continuous("Gene richness (on a scale of 100)") +
ggtitle("Higher RT primer concentration give higher richness.")
## `geom_smooth()` using method = 'loess'
## Warning: Removed 309 rows containing non-finite values (stat_smooth).
## Warning: Removed 309 rows containing missing values (geom_point).
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.7.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.7.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] viridis_0.4.0 viridisLite_0.2.0
## [3] SummarizedExperiment_1.9.14 DelayedArray_0.4.1
## [5] matrixStats_0.52.2 Biobase_2.38.0
## [7] GenomicRanges_1.31.19 GenomeInfoDb_1.15.5
## [9] IRanges_2.13.26 S4Vectors_0.17.32
## [11] BiocGenerics_0.25.3 MultiAssayExperiment_1.5.41
## [13] magrittr_1.5 ggplot2_2.2.1
## [15] CAGEr_1.21.5.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-131 bitops_1.0-6
## [3] RColorBrewer_1.1-2 rprojroot_1.3-2
## [5] tools_3.4.3 backports_1.1.2
## [7] R6_2.2.2 vegan_2.4-5
## [9] platetools_0.0.2 KernSmooth_2.23-15
## [11] lazyeval_0.2.1 mgcv_1.8-22
## [13] colorspace_1.3-2 permute_0.9-4
## [15] gridExtra_2.3 compiler_3.4.3
## [17] VennDiagram_1.6.18 rtracklayer_1.39.9
## [19] labeling_0.3 scales_0.5.0
## [21] stringr_1.3.0 digest_0.6.15
## [23] Rsamtools_1.31.3 rmarkdown_1.9
## [25] stringdist_0.9.4.6 XVector_0.19.8
## [27] pkgconfig_2.0.1 htmltools_0.3.6
## [29] BSgenome_1.47.5 rlang_0.2.0
## [31] VGAM_1.0-4 bindr_0.1
## [33] BiocParallel_1.12.0 gtools_3.5.0
## [35] dplyr_0.7.4 RCurl_1.95-4.10
## [37] GenomeInfoDbData_0.99.1 futile.logger_1.4.3
## [39] smallCAGEqc_0.12.2.999999 Matrix_1.2-12
## [41] Rcpp_0.12.16 munsell_0.4.3
## [43] stringi_1.1.7 yaml_2.1.18
## [45] MASS_7.3-47 zlibbioc_1.24.0
## [47] plyr_1.8.4 grid_3.4.3
## [49] gdata_2.18.0 lattice_0.20-35
## [51] cowplot_0.9.2 Biostrings_2.47.9
## [53] splines_3.4.3 knitr_1.20
## [55] beanplot_1.2 pillar_1.2.1
## [57] ggpubr_0.1.6 reshape2_1.4.2
## [59] codetools_0.2-15 futile.options_1.0.0
## [61] XML_3.98-1.9 glue_1.2.0
## [63] evaluate_0.10.1 lambda.r_1.2
## [65] data.table_1.10.4-3 gtable_0.2.0
## [67] purrr_0.2.4 tidyr_0.7.2
## [69] reshape_0.8.7 assertthat_0.2.0
## [71] tibble_1.4.2 som_0.3-5.1
## [73] GenomicAlignments_1.15.12 memoise_1.1.0
## [75] bindrcpp_0.2 cluster_2.0.6